In [1]:
# Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import squarify
import scipy
from scipy import stats
from plotly.tools import FigureFactory as FF
import chart_studio.plotly as py

1. Purchase Behaviour

In [2]:
# Load loyalty-card customer segments (one row per loyalty card).
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
purchase=pd.read_csv("C:/Users/ypleow/Desktop/Virtual Intern/Quantium/Task 1/QVI_purchase_behaviour.csv");
purchase.head(2)
Out[2]:
LYLTY_CARD_NBR LIFESTAGE PREMIUM_CUSTOMER
0 1000 YOUNG SINGLES/COUPLES Premium
1 1002 YOUNG SINGLES/COUPLES Mainstream
In [3]:
purchase.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72637 entries, 0 to 72636
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   LYLTY_CARD_NBR    72637 non-null  int64 
 1   LIFESTAGE         72637 non-null  object
 2   PREMIUM_CUSTOMER  72637 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.7+ MB
In [4]:
purchase['LYLTY_CARD_NBR'].nunique()
Out[4]:
72637
In [5]:
purchase.describe(include='all')
Out[5]:
LYLTY_CARD_NBR LIFESTAGE PREMIUM_CUSTOMER
count 7.263700e+04 72637 72637
unique NaN 7 3
top NaN RETIREES Mainstream
freq NaN 14805 29245
mean 1.361859e+05 NaN NaN
std 8.989293e+04 NaN NaN
min 1.000000e+03 NaN NaN
25% 6.620200e+04 NaN NaN
50% 1.340400e+05 NaN NaN
75% 2.033750e+05 NaN NaN
max 2.373711e+06 NaN NaN
In [6]:
# Customer counts per lifestage, ordered most common first
# (value_counts sorts descending by default).
lifestage_counts = purchase['LIFESTAGE'].value_counts()
plt.figure(figsize=(6,8))
plt.bar(lifestage_counts.index, lifestage_counts,
        color=sns.color_palette("mako"))
plt.xticks(rotation=90)
Out[6]:
([0, 1, 2, 3, 4, 5, 6], <a list of 7 Text xticklabel objects>)
In [7]:
# Number of loyalty cards in each premium tier, largest first.
customer_grouped = (
    purchase.groupby('PREMIUM_CUSTOMER')['LYLTY_CARD_NBR']
    .count()
    .sort_values(ascending=False)
)
customer_grouped
Out[7]:
PREMIUM_CUSTOMER
Mainstream    29245
Budget        24470
Premium       18922
Name: LYLTY_CARD_NBR, dtype: int64
In [8]:
plt.figure(figsize=(8,8))
colors = ['royalblue', 'darkslateblue', 'slategrey']
#colors = ['#ff9999','#66b3ff','#99ff99']
customer_grouped.plot.pie(autopct="%.1f%%", colors= colors)
Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0x17b24840188>
In [9]:
plt.figure(figsize=(6,8))
plt.bar(purchase['PREMIUM_CUSTOMER'].value_counts().index,purchase['PREMIUM_CUSTOMER'].value_counts(), 
        color = sns.color_palette("mako"))
plt.xticks(rotation = 45)
Out[9]:
([0, 1, 2], <a list of 3 Text xticklabel objects>)

2. Transaction

In [10]:
# Load the raw transaction data (one row per basket line item).
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
transaction=pd.read_excel("C:/Users/ypleow/Desktop/Virtual Intern/Quantium/Task 1/QVI_transaction_data.xlsx")
transaction.head(2)
Out[10]:
DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR PROD_NAME PROD_QTY TOT_SALES
0 43390 1 1000 1 5 Natural Chip Compny SeaSalt175g 2 6.0
1 43599 1 1307 348 66 CCs Nacho Cheese 175g 3 6.3
In [11]:
transaction.describe(include='all')
Out[11]:
DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR PROD_NAME PROD_QTY TOT_SALES
count 264836.000000 264836.00000 2.648360e+05 2.648360e+05 264836.000000 264836 264836.000000 264836.000000
unique NaN NaN NaN NaN NaN 114 NaN NaN
top NaN NaN NaN NaN NaN Kettle Mozzarella Basil & Pesto 175g NaN NaN
freq NaN NaN NaN NaN NaN 3304 NaN NaN
mean 43464.036260 135.08011 1.355495e+05 1.351583e+05 56.583157 NaN 1.907309 7.304200
std 105.389282 76.78418 8.057998e+04 7.813303e+04 32.826638 NaN 0.643654 3.083226
min 43282.000000 1.00000 1.000000e+03 1.000000e+00 1.000000 NaN 1.000000 1.500000
25% 43373.000000 70.00000 7.002100e+04 6.760150e+04 28.000000 NaN 2.000000 5.400000
50% 43464.000000 130.00000 1.303575e+05 1.351375e+05 56.000000 NaN 2.000000 7.400000
75% 43555.000000 203.00000 2.030942e+05 2.027012e+05 85.000000 NaN 2.000000 9.200000
max 43646.000000 272.00000 2.373711e+06 2.415841e+06 114.000000 NaN 200.000000 650.000000
In [12]:
transaction['PROD_NBR'].nunique()
Out[12]:
114
In [13]:
transaction['STORE_NBR'].nunique()
Out[13]:
272
In [14]:
transaction['TXN_ID'].nunique()
Out[14]:
263127
In [15]:
transaction.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264836 entries, 0 to 264835
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   DATE            264836 non-null  int64  
 1   STORE_NBR       264836 non-null  int64  
 2   LYLTY_CARD_NBR  264836 non-null  int64  
 3   TXN_ID          264836 non-null  int64  
 4   PROD_NBR        264836 non-null  int64  
 5   PROD_NAME       264836 non-null  object 
 6   PROD_QTY        264836 non-null  int64  
 7   TOT_SALES       264836 non-null  float64
dtypes: float64(1), int64(6), object(1)
memory usage: 16.2+ MB

Format data type - DATE

In [16]:
# DATE arrives as an Excel serial day number (e.g. 43390); Excel's day zero
# is 1899-12-30, so convert with that origin to get real calendar dates.
transaction["DATE"]=pd.to_datetime(transaction["DATE"], origin = "1899-12-30",unit="D")
In [17]:
transaction.head(2)
Out[17]:
DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR PROD_NAME PROD_QTY TOT_SALES
0 2018-10-17 1 1000 1 5 Natural Chip Compny SeaSalt175g 2 6.0
1 2019-05-14 1 1307 348 66 CCs Nacho Cheese 175g 3 6.3
In [18]:
transaction['DATE'].describe()
C:\Users\ypleow\anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning:

Treating datetime data as categorical rather than numeric in `.describe` is deprecated and will be removed in a future version of pandas. Specify `datetime_is_numeric=True` to silence this warning and adopt the future behavior now.

Out[18]:
count                  264836
unique                    364
top       2018-12-24 00:00:00
freq                      939
first     2018-07-01 00:00:00
last      2019-06-30 00:00:00
Name: DATE, dtype: object
In [19]:
# Calendar days in the one-year study window that never appear in the data —
# reveals any dates with no transactions at all.
pd.date_range(start = '2018-07-01', end = '2019-06-30' ).difference(transaction.DATE)
Out[19]:
DatetimeIndex(['2018-12-25'], dtype='datetime64[ns]', freq=None)

Since the "DATE" column has 364 unique values, one date is missing from the one-year window. 2018-12-25 is identified as the missing date — most likely because the shops were closed on Christmas Day.

Outliers

In [20]:
transaction['PROD_QTY'].value_counts()
Out[20]:
2      236039
1       27518
5         450
3         430
4         397
200         2
Name: PROD_QTY, dtype: int64
In [21]:
# Remove the two extreme-outlier rows (PROD_QTY == 200); every other line
# item buys at most 5 packets, so these are clearly anomalous bulk buys.
# A boolean filter is clearer and more idiomatic than building an index
# and calling drop(..., inplace=True).
transaction = transaction[transaction['PROD_QTY'] != 200]
In [22]:
transaction.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 264834 entries, 0 to 264835
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   DATE            264834 non-null  datetime64[ns]
 1   STORE_NBR       264834 non-null  int64         
 2   LYLTY_CARD_NBR  264834 non-null  int64         
 3   TXN_ID          264834 non-null  int64         
 4   PROD_NBR        264834 non-null  int64         
 5   PROD_NAME       264834 non-null  object        
 6   PROD_QTY        264834 non-null  int64         
 7   TOT_SALES       264834 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(5), object(1)
memory usage: 18.2+ MB

Create New Features - BRAND and PACKAGE_SIZE

In [23]:
transaction['PROD_NAME'].value_counts()
Out[23]:
Kettle Mozzarella   Basil & Pesto 175g      3304
Kettle Tortilla ChpsHny&Jlpno Chili 150g    3296
Cobs Popd Swt/Chlli &Sr/Cream Chips 110g    3269
Tyrrells Crisps     Ched & Chives 165g      3268
Cobs Popd Sea Salt  Chips 110g              3265
                                            ... 
RRD Pc Sea Salt     165g                    1431
Woolworths Medium   Salsa 300g              1430
NCC Sour Cream &    Garden Chives 175g      1419
French Fries Potato Chips 175g              1418
WW Crinkle Cut      Original 175g           1410
Name: PROD_NAME, Length: 114, dtype: int64
In [24]:
# Extract the pack size (grams) from the product name by stripping every
# non-digit character. regex=True is passed explicitly: relying on the old
# regex-by-default behaviour of str.replace raises a FutureWarning and the
# default flips to literal matching in pandas >= 2.0.
# NOTE: the result is still a string column; it is cast to int after the merge.
transaction['PACKAGE_SIZE'] = transaction['PROD_NAME'].str.replace(r'[^\d]+', '', regex=True)
In [25]:
transaction.head()
Out[25]:
DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR PROD_NAME PROD_QTY TOT_SALES PACKAGE_SIZE
0 2018-10-17 1 1000 1 5 Natural Chip Compny SeaSalt175g 2 6.0 175
1 2019-05-14 1 1307 348 66 CCs Nacho Cheese 175g 3 6.3 175
2 2019-05-20 1 1343 383 61 Smiths Crinkle Cut Chips Chicken 170g 2 2.9 170
3 2018-08-17 2 2373 974 69 Smiths Chip Thinly S/Cream&Onion 175g 5 15.0 175
4 2018-08-18 2 2426 1038 108 Kettle Tortilla ChpsHny&Jlpno Chili 150g 3 13.8 150
In [26]:
sorted(transaction['PROD_NAME'].unique())
Out[26]:
['Burger Rings 220g',
 'CCs Nacho Cheese    175g',
 'CCs Original 175g',
 'CCs Tasty Cheese    175g',
 'Cheetos Chs & Bacon Balls 190g',
 'Cheetos Puffs 165g',
 'Cheezels Cheese 330g',
 'Cheezels Cheese Box 125g',
 'Cobs Popd Sea Salt  Chips 110g',
 'Cobs Popd Sour Crm  &Chives Chips 110g',
 'Cobs Popd Swt/Chlli &Sr/Cream Chips 110g',
 'Dorito Corn Chp     Supreme 380g',
 'Doritos Cheese      Supreme 330g',
 'Doritos Corn Chip Mexican Jalapeno 150g',
 'Doritos Corn Chip Southern Chicken 150g',
 'Doritos Corn Chips  Cheese Supreme 170g',
 'Doritos Corn Chips  Nacho Cheese 170g',
 'Doritos Corn Chips  Original 170g',
 'Doritos Mexicana    170g',
 'Doritos Salsa       Medium 300g',
 'Doritos Salsa Mild  300g',
 'French Fries Potato Chips 175g',
 'Grain Waves         Sweet Chilli 210g',
 'Grain Waves Sour    Cream&Chives 210G',
 'GrnWves Plus Btroot & Chilli Jam 180g',
 'Infuzions BBQ Rib   Prawn Crackers 110g',
 'Infuzions Mango     Chutny Papadums 70g',
 'Infuzions SourCream&Herbs Veg Strws 110g',
 'Infuzions Thai SweetChili PotatoMix 110g',
 'Infzns Crn Crnchers Tangy Gcamole 110g',
 'Kettle 135g Swt Pot Sea Salt',
 'Kettle Chilli 175g',
 'Kettle Honey Soy    Chicken 175g',
 'Kettle Mozzarella   Basil & Pesto 175g',
 'Kettle Original 175g',
 'Kettle Sea Salt     And Vinegar 175g',
 'Kettle Sensations   BBQ&Maple 150g',
 'Kettle Sensations   Camembert & Fig 150g',
 'Kettle Sensations   Siracha Lime 150g',
 'Kettle Sweet Chilli And Sour Cream 175g',
 'Kettle Tortilla ChpsBtroot&Ricotta 150g',
 'Kettle Tortilla ChpsFeta&Garlic 150g',
 'Kettle Tortilla ChpsHny&Jlpno Chili 150g',
 'NCC Sour Cream &    Garden Chives 175g',
 'Natural Chip        Compny SeaSalt175g',
 'Natural Chip Co     Tmato Hrb&Spce 175g',
 'Natural ChipCo      Hony Soy Chckn175g',
 'Natural ChipCo Sea  Salt & Vinegr 175g',
 'Old El Paso Salsa   Dip Chnky Tom Ht300g',
 'Old El Paso Salsa   Dip Tomato Med 300g',
 'Old El Paso Salsa   Dip Tomato Mild 300g',
 'Pringles Barbeque   134g',
 'Pringles Chicken    Salt Crips 134g',
 'Pringles Mystery    Flavour 134g',
 'Pringles Original   Crisps 134g',
 'Pringles Slt Vingar 134g',
 'Pringles SourCream  Onion 134g',
 'Pringles Sthrn FriedChicken 134g',
 'Pringles Sweet&Spcy BBQ 134g',
 'RRD Chilli&         Coconut 150g',
 'RRD Honey Soy       Chicken 165g',
 'RRD Lime & Pepper   165g',
 'RRD Pc Sea Salt     165g',
 'RRD SR Slow Rst     Pork Belly 150g',
 'RRD Salt & Vinegar  165g',
 'RRD Steak &         Chimuchurri 150g',
 'RRD Sweet Chilli &  Sour Cream 165g',
 'Red Rock Deli Chikn&Garlic Aioli 150g',
 'Red Rock Deli SR    Salsa & Mzzrlla 150g',
 'Red Rock Deli Sp    Salt & Truffle 150G',
 'Red Rock Deli Thai  Chilli&Lime 150g',
 'Smith Crinkle Cut   Bolognese 150g',
 'Smith Crinkle Cut   Mac N Cheese 150g',
 'Smiths Chip Thinly  Cut Original 175g',
 'Smiths Chip Thinly  CutSalt/Vinegr175g',
 'Smiths Chip Thinly  S/Cream&Onion 175g',
 'Smiths Crinkle      Original 330g',
 'Smiths Crinkle Chips Salt & Vinegar 330g',
 'Smiths Crinkle Cut  Chips Barbecue 170g',
 'Smiths Crinkle Cut  Chips Chicken 170g',
 'Smiths Crinkle Cut  Chips Chs&Onion170g',
 'Smiths Crinkle Cut  Chips Original 170g',
 'Smiths Crinkle Cut  French OnionDip 150g',
 'Smiths Crinkle Cut  Salt & Vinegar 170g',
 'Smiths Crinkle Cut  Snag&Sauce 150g',
 'Smiths Crinkle Cut  Tomato Salsa 150g',
 'Smiths Crnkle Chip  Orgnl Big Bag 380g',
 'Smiths Thinly       Swt Chli&S/Cream175G',
 'Smiths Thinly Cut   Roast Chicken 175g',
 'Snbts Whlgrn Crisps Cheddr&Mstrd 90g',
 'Sunbites Whlegrn    Crisps Frch/Onin 90g',
 'Thins Chips         Originl saltd 175g',
 'Thins Chips Light&  Tangy 175g',
 'Thins Chips Salt &  Vinegar 175g',
 'Thins Chips Seasonedchicken 175g',
 'Thins Potato Chips  Hot & Spicy 175g',
 'Tostitos Lightly    Salted 175g',
 'Tostitos Smoked     Chipotle 175g',
 'Tostitos Splash Of  Lime 175g',
 'Twisties Cheese     270g',
 'Twisties Cheese     Burger 250g',
 'Twisties Chicken270g',
 'Tyrrells Crisps     Ched & Chives 165g',
 'Tyrrells Crisps     Lightly Salted 165g',
 'WW Crinkle Cut      Chicken 175g',
 'WW Crinkle Cut      Original 175g',
 'WW D/Style Chip     Sea Salt 200g',
 'WW Original Corn    Chips 200g',
 'WW Original Stacked Chips 160g',
 'WW Sour Cream &OnionStacked Chips 160g',
 'WW Supreme Cheese   Corn Chips 200g',
 'Woolworths Cheese   Rings 190g',
 'Woolworths Medium   Salsa 300g',
 'Woolworths Mild     Salsa 300g']
In [27]:
# Brand = first whitespace-delimited token of the product name.
# Vectorized string accessors are faster and more idiomatic than a Python
# list comprehension over the column.
transaction['BRAND'] = transaction['PROD_NAME'].str.split().str[0]
In [28]:
sorted(transaction['BRAND'].unique())
Out[28]:
['Burger',
 'CCs',
 'Cheetos',
 'Cheezels',
 'Cobs',
 'Dorito',
 'Doritos',
 'French',
 'Grain',
 'GrnWves',
 'Infuzions',
 'Infzns',
 'Kettle',
 'NCC',
 'Natural',
 'Old',
 'Pringles',
 'RRD',
 'Red',
 'Smith',
 'Smiths',
 'Snbts',
 'Sunbites',
 'Thins',
 'Tostitos',
 'Twisties',
 'Tyrrells',
 'WW',
 'Woolworths']
In [29]:
# Normalise brand-name variants to one canonical spelling per brand.
# A single dict-based replace is clearer than eight chained calls, and
# assigning the result avoids Series.replace(..., inplace=True) on a column
# view, which is deprecated / chained-assignment-prone in newer pandas.
brand_map = {
    'Dorito': 'Doritos',
    'Infzns': 'Infuzions',
    'Natural': 'NCC',
    'Old': 'Old El Paso',
    'Red': 'RRD',
    'Smith': 'Smiths',
    'Snbts': 'Sunbites',
    'WW': 'Woolworths',
}
transaction['BRAND'] = transaction['BRAND'].replace(brand_map)
In [30]:
sorted(transaction['BRAND'].unique())
Out[30]:
['Burger',
 'CCs',
 'Cheetos',
 'Cheezels',
 'Cobs',
 'Doritos',
 'French',
 'Grain',
 'GrnWves',
 'Infuzions',
 'Kettle',
 'NCC',
 'Old El Paso',
 'Pringles',
 'RRD',
 'Smiths',
 'Sunbites',
 'Thins',
 'Tostitos',
 'Twisties',
 'Tyrrells',
 'Woolworths']

Create New Feature - PRICE (per packet)

In [31]:
# Unit price per packet = total sale value / number of packets in the line item.
transaction["PRICE"]= transaction["TOT_SALES"]/transaction["PROD_QTY"]
In [32]:
transaction.head()
Out[32]:
DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR PROD_NAME PROD_QTY TOT_SALES PACKAGE_SIZE BRAND PRICE
0 2018-10-17 1 1000 1 5 Natural Chip Compny SeaSalt175g 2 6.0 175 NCC 3.00
1 2019-05-14 1 1307 348 66 CCs Nacho Cheese 175g 3 6.3 175 CCs 2.10
2 2019-05-20 1 1343 383 61 Smiths Crinkle Cut Chips Chicken 170g 2 2.9 170 Smiths 1.45
3 2018-08-17 2 2373 974 69 Smiths Chip Thinly S/Cream&Onion 175g 5 15.0 175 Smiths 3.00
4 2018-08-18 2 2426 1038 108 Kettle Tortilla ChpsHny&Jlpno Chili 150g 3 13.8 150 Kettle 4.60
In [33]:
transaction.describe(include='all')
C:\Users\ypleow\anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning:

Treating datetime data as categorical rather than numeric in `.describe` is deprecated and will be removed in a future version of pandas. Specify `datetime_is_numeric=True` to silence this warning and adopt the future behavior now.

Out[33]:
DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR PROD_NAME PROD_QTY TOT_SALES PACKAGE_SIZE BRAND PRICE
count 264834 264834.000000 2.648340e+05 2.648340e+05 264834.000000 264834 264834.000000 264834.000000 264834 264834 264834.000000
unique 364 NaN NaN NaN NaN 114 NaN NaN 21 22 NaN
top 2018-12-24 00:00:00 NaN NaN NaN NaN Kettle Mozzarella Basil & Pesto 175g NaN NaN 175 Kettle NaN
freq 939 NaN NaN NaN NaN 3304 NaN NaN 66390 41288 NaN
first 2018-07-01 00:00:00 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
last 2019-06-30 00:00:00 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
mean NaN 135.079423 1.355488e+05 1.351576e+05 56.583554 NaN 1.905813 7.299346 NaN NaN 3.824629
std NaN 76.784063 8.057990e+04 7.813292e+04 32.826444 NaN 0.343436 2.527241 NaN NaN 1.109526
min NaN 1.000000 1.000000e+03 1.000000e+00 1.000000 NaN 1.000000 1.500000 NaN NaN 1.320000
25% NaN 70.000000 7.002100e+04 6.760050e+04 28.000000 NaN 2.000000 5.400000 NaN NaN 3.000000
50% NaN 130.000000 1.303570e+05 1.351365e+05 56.000000 NaN 2.000000 7.400000 NaN NaN 3.800000
75% NaN 203.000000 2.030940e+05 2.026998e+05 85.000000 NaN 2.000000 9.200000 NaN NaN 4.600000
max NaN 272.000000 2.373711e+06 2.415841e+06 114.000000 NaN 5.000000 29.500000 NaN NaN 6.500000
In [34]:
plt.figure(figsize=(10,6))
#plt.xticks(rotation=90)
#sns.countplot(transaction['PACKAGE_SIZE'])
plt.bar(transaction['PACKAGE_SIZE'].value_counts().index,transaction['PACKAGE_SIZE'].value_counts(),
       color = sns.color_palette("mako"))
plt.xticks(rotation = 90)
Out[34]:
([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
 <a list of 21 Text xticklabel objects>)
In [35]:
plt.figure(figsize=(10,6))
sns.barplot(x='PACKAGE_SIZE',y='PROD_QTY',data=transaction,estimator=np.sum)
plt.xticks(rotation = 45)
Out[35]:
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20]),
 <a list of 21 Text xticklabel objects>)
In [36]:
plt.figure(figsize=(10,6))
plt.bar(transaction['BRAND'].value_counts().index,transaction['BRAND'].value_counts(),
       color = sns.color_palette("mako"))
plt.xticks(rotation = 90)
Out[36]:
([0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21],
 <a list of 22 Text xticklabel objects>)
In [37]:
plt.figure(figsize=(10,6))
sns.barplot(x='BRAND',y='PROD_QTY',data=transaction,estimator=np.sum)
plt.xticks(rotation = 45)
Out[37]:
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21]),
 <a list of 22 Text xticklabel objects>)
In [38]:
grouped = transaction.groupby(['BRAND', 'PACKAGE_SIZE']).size().sort_values(ascending=False).head(10)
grouped
Out[38]:
BRAND        PACKAGE_SIZE
Pringles     134             25102
Kettle       175             19022
             150             19009
Thins        175             14075
Infuzions    110             12694
Doritos      170             12613
RRD          150             10372
Cobs         110              9693
Tostitos     175              9471
Old El Paso  300              9324
dtype: int64
In [39]:
# Correlation heatmap of the numeric columns. Selecting numeric dtypes
# explicitly gives the same result while avoiding DataFrame.corr's implicit
# dropping of object columns (deprecated, and an error in pandas >= 2.0).
plt.figure(figsize=(8, 8))
sns.heatmap(transaction.select_dtypes(include='number').corr(), cmap='twilight', annot=True)
plt.xticks(rotation = 45)
plt.yticks(rotation = 45)
Out[39]:
(array([0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5]),
 <a list of 7 Text yticklabel objects>)

3. Combine Datasets

In [40]:
# Inner join on loyalty-card number attaches each transaction's customer
# segment. The row count is unchanged afterwards (264834, per df.info below),
# so every card in `transaction` exists in `purchase` — a many-to-one merge.
# NOTE(review): consider passing validate='m:1' to assert that relationship.
df=pd.merge(transaction,purchase,on='LYLTY_CARD_NBR')
df.head(2)
Out[40]:
DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR PROD_NAME PROD_QTY TOT_SALES PACKAGE_SIZE BRAND PRICE LIFESTAGE PREMIUM_CUSTOMER
0 2018-10-17 1 1000 1 5 Natural Chip Compny SeaSalt175g 2 6.0 175 NCC 3.0 YOUNG SINGLES/COUPLES Premium
1 2019-05-14 1 1307 348 66 CCs Nacho Cheese 175g 3 6.3 175 CCs 2.1 MIDAGE SINGLES/COUPLES Budget
In [41]:
df.info()
#df.isna().sum()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 264834 entries, 0 to 264833
Data columns (total 13 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   DATE              264834 non-null  datetime64[ns]
 1   STORE_NBR         264834 non-null  int64         
 2   LYLTY_CARD_NBR    264834 non-null  int64         
 3   TXN_ID            264834 non-null  int64         
 4   PROD_NBR          264834 non-null  int64         
 5   PROD_NAME         264834 non-null  object        
 6   PROD_QTY          264834 non-null  int64         
 7   TOT_SALES         264834 non-null  float64       
 8   PACKAGE_SIZE      264834 non-null  object        
 9   BRAND             264834 non-null  object        
 10  PRICE             264834 non-null  float64       
 11  LIFESTAGE         264834 non-null  object        
 12  PREMIUM_CUSTOMER  264834 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(5), object(5)
memory usage: 28.3+ MB
In [42]:
df['PACKAGE_SIZE'] = df.PACKAGE_SIZE.astype(int)
In [43]:
df['TXN_ID'] = df.TXN_ID.astype(str)

Daily transaction and quantity sold

In [44]:
# Daily transaction counts. Select the column *before* aggregating so pandas
# counts one column instead of counting all thirteen and discarding twelve.
date_grouped = df.groupby('DATE')[['TXN_ID']].count()
date_grouped
Out[44]:
TXN_ID
DATE
2018-07-01 724
2018-07-02 711
2018-07-03 722
2018-07-04 714
2018-07-05 712
... ...
2019-06-26 723
2019-06-27 709
2019-06-28 730
2019-06-29 745
2019-06-30 744

364 rows × 1 columns

In [45]:
# Daily packets sold. Selecting PROD_QTY before .sum() avoids summing every
# column — concatenating the object/string columns is wasted work and is
# deprecated (an error in newer pandas).
date1_grouped = df.groupby('DATE')[['PROD_QTY']].sum()
date1_grouped
Out[45]:
PROD_QTY
DATE
2018-07-01 1394
2018-07-02 1367
2018-07-03 1389
2018-07-04 1373
2018-07-05 1358
... ...
2019-06-26 1380
2019-06-27 1352
2019-06-28 1400
2019-06-29 1438
2019-06-30 1416

364 rows × 1 columns

In [46]:
# Dual-axis daily view: bars = transaction count (left axis),
# line = packets sold (right axis). Built with the explicit fig/ax API.
fig, ax = plt.subplots(figsize=(16, 8))

# Bars: number of transactions per day.
ax.bar(date_grouped.index.values,
       date_grouped['TXN_ID'],
       color='lightslategrey')


# Second y-axis sharing the same x-axis for the quantity series.
ax2=ax.twinx()
# Line: total packets sold per day, on the right-hand axis.
ax2.plot(date1_grouped.index.values,
       date1_grouped['PROD_QTY'],
       color='crimson')
ax2.set_ylabel("Total Quantity",color="crimson")

# Clip the lower bound so day-to-day variation is visible.
ax2.set_ylim(ymin=1000)


# Title and shared x-axis label.
ax.set(xlabel="Date",
       title="Daily Transaction")

ax.set_ylabel("Total Transaction",color='lightslategrey')

# Same idea for the bar axis: start above zero to emphasise variation.
ax.set_ylim(ymin=500)

plt.show()

Result: Both total transactions and total quantity sold spike at the end of 2018 (the lead-up to Christmas). Total quantity sold drops sharply in mid-August 2018 and in May 2019, even though the transaction count stays within its usual range.

Grouping PACKAGE_SIZE

In [47]:
# Bucket pack sizes into three bands. np.select evaluates conditions in
# order, so the <=150 test takes precedence over <=300 for small packs —
# the same precedence the original sequential .loc assignments relied on.
# Anything matching neither condition is > 300.
size_conditions = [
    df['PACKAGE_SIZE'] <= 150,
    df['PACKAGE_SIZE'] <= 300,
]
size_labels = ['Below 150', 'Between 150 and 300']
df['PACKAGE_SIZECATEGORY'] = np.select(size_conditions, size_labels, default='Above 300')
In [48]:
df['PACKAGE_SIZECATEGORY'].value_counts()
Out[48]:
Between 150 and 300    146032
Below 150               99846
Above 300               18956
Name: PACKAGE_SIZECATEGORY, dtype: int64

Total Sales according to customer segments

In [49]:
premium_grouped = df.groupby('PREMIUM_CUSTOMER')['TOT_SALES'].sum().sort_values(ascending=False)
premium_grouped
Out[49]:
PREMIUM_CUSTOMER
Mainstream    750744.50
Budget        676211.55
Premium       506158.95
Name: TOT_SALES, dtype: float64
In [50]:
plt.figure(figsize=(8,8))
colors = ['steelblue','lightseagreen','mediumaquamarine']
premium_grouped.plot.pie(autopct="%.1f%%", colors=colors)
# Results: Mainstream has higher total sales followed by budget and premium.
Out[50]:
<matplotlib.axes._subplots.AxesSubplot at 0x17b2bdf9288>
In [51]:
plt.figure(figsize=(10,6))
sns.barplot(x='PREMIUM_CUSTOMER',y='TOT_SALES',data=df,estimator=np.sum, palette='viridis')
Out[51]:
<matplotlib.axes._subplots.AxesSubplot at 0x17b2a287588>
In [52]:
plt.figure(figsize=(10,6))
sns.barplot(x='LIFESTAGE',y='TOT_SALES',data=df,estimator=np.sum, palette='mako')
plt.xticks(rotation = 45)
# Results: Older's lifestage (Older Families, Older Singles/ Couples and Retirees) contribute the most total sales 
#          while New Families has lowest total sales.
Out[52]:
(array([0, 1, 2, 3, 4, 5, 6]), <a list of 7 Text xticklabel objects>)
In [53]:
# Total sales by (lifestage, premium tier). Select TOT_SALES before summing
# so only that column is aggregated — summing the object/string columns is
# wasted work and deprecated in newer pandas.
df_grouped = df.groupby(['LIFESTAGE','PREMIUM_CUSTOMER'])[['TOT_SALES']].sum()
df_grouped
Out[53]:
TOT_SALES
LIFESTAGE PREMIUM_CUSTOMER
MIDAGE SINGLES/COUPLES Budget 35514.80
Mainstream 90803.85
Premium 58432.65
NEW FAMILIES Budget 21928.45
Mainstream 17013.90
Premium 11491.10
OLDER FAMILIES Budget 168363.25
Mainstream 103445.55
Premium 80658.40
OLDER SINGLES/COUPLES Budget 136769.80
Mainstream 133393.80
Premium 132263.15
RETIREES Budget 113147.80
Mainstream 155677.05
Premium 97646.05
YOUNG FAMILIES Budget 139345.85
Mainstream 92788.75
Premium 84025.50
YOUNG SINGLES/COUPLES Budget 61141.60
Mainstream 157621.60
Premium 41642.10
In [54]:
plt.figure(figsize=(16,8))
sns.barplot(y=df_grouped.reset_index()['TOT_SALES'],x=df_grouped.reset_index()['LIFESTAGE'],
            hue=df_grouped.reset_index()['PREMIUM_CUSTOMER'], palette='rocket')
Out[54]:
<matplotlib.axes._subplots.AxesSubplot at 0x17b2d280ec8>

The Mainstream segment and the Older Singles/Couples segment have the highest total sales when premium tier and lifestage are examined separately. Drilling into the combined segmentation, Budget – Older Families contribute the highest total sales, followed by Mainstream – Young Singles/Couples and Mainstream – Retirees.

Per transaction

In [55]:
# Average packets per transaction, keeping each transaction's segment labels
# in the index. Selecting PROD_QTY before .mean() avoids averaging every
# numeric column and the deprecated implicit drop of non-numeric ones.
cust_grouped = df.groupby(['TXN_ID','LIFESTAGE','PREMIUM_CUSTOMER'])[['PROD_QTY']].mean()
cust_grouped
Out[55]:
PROD_QTY
TXN_ID LIFESTAGE PREMIUM_CUSTOMER
1 YOUNG SINGLES/COUPLES Premium 2.0
10 YOUNG SINGLES/COUPLES Mainstream 2.0
100 YOUNG SINGLES/COUPLES Budget 1.0
10000 OLDER SINGLES/COUPLES Mainstream 2.0
100000 OLDER FAMILIES Mainstream 2.0
... ... ... ...
99995 OLDER FAMILIES Premium 2.0
99996 OLDER FAMILIES Premium 2.0
99997 OLDER FAMILIES Premium 2.0
99998 OLDER FAMILIES Mainstream 2.0
99999 OLDER FAMILIES Mainstream 2.0

263127 rows × 1 columns

In [56]:
# Compare lifestage and premium customer by the quantity purchased
# Results: Older families, Young Families and Older Singles/Couples have overall higher average quantity purchased. 
#          Every categories has more than 1 average quantity purchased.

customer_grouped = cust_grouped.groupby(['LIFESTAGE','PREMIUM_CUSTOMER']).mean()[['PROD_QTY']]
customer_grouped
Out[56]:
PROD_QTY
LIFESTAGE PREMIUM_CUSTOMER
MIDAGE SINGLES/COUPLES Budget 1.891339
Mainstream 1.911575
Premium 1.889277
NEW FAMILIES Budget 1.853226
Mainstream 1.857389
Premium 1.860480
OLDER FAMILIES Budget 1.945389
Mainstream 1.948168
Premium 1.945513
OLDER SINGLES/COUPLES Budget 1.913201
Mainstream 1.910273
Premium 1.914184
RETIREES Budget 1.892146
Mainstream 1.887352
Premium 1.900046
YOUNG FAMILIES Budget 1.940380
Mainstream 1.940272
Premium 1.937413
YOUNG SINGLES/COUPLES Budget 1.803994
Mainstream 1.852139
Premium 1.803691
In [57]:
plt.figure(figsize=(16,8))
# Bug fix: the hue series must come from customer_grouped (the frame being
# plotted), not df_grouped — the two frames happen to share an index layout
# here, so the old code plotted correctly, but it was fragile and misleading.
plot_data = customer_grouped.reset_index()
sns.barplot(y=plot_data['PROD_QTY'], x=plot_data['LIFESTAGE'],
            hue=plot_data['PREMIUM_CUSTOMER'], palette='mako')
# Zoom the y-axis: all segment averages sit between roughly 1.8 and 1.95.
plt.ylim(1.6, 2)
Out[57]:
(1.6, 2)
In [58]:
premium_grouped = cust_grouped.groupby(['PREMIUM_CUSTOMER']).mean()[['PROD_QTY']]
#premium_grouped
premium_grouped.sort_values("PROD_QTY",ascending=False)
Out[58]:
PROD_QTY
PREMIUM_CUSTOMER
Budget 1.909320
Premium 1.906228
Mainstream 1.901532
In [59]:
# Average packets per line item by lifestage, highest first. Select PROD_QTY
# up front instead of averaging the whole frame and slicing afterwards.
lifestage_grouped = df.groupby(['LIFESTAGE'])[['PROD_QTY']].mean()
lifestage_grouped.sort_values("PROD_QTY",ascending=False)
Out[59]:
PROD_QTY
LIFESTAGE
OLDER FAMILIES 1.946578
YOUNG FAMILIES 1.939828
OLDER SINGLES/COUPLES 1.912719
MIDAGE SINGLES/COUPLES 1.900478
RETIREES 1.892289
NEW FAMILIES 1.856771
YOUNG SINGLES/COUPLES 1.831762

Older Families and Young Families have the highest quantity purchased per transaction.

Price

In [60]:
size_grouped = df.groupby(['LIFESTAGE','PACKAGE_SIZECATEGORY'])['LIFESTAGE'].count()
size_grouped
Out[60]:
LIFESTAGE               PACKAGE_SIZECATEGORY
MIDAGE SINGLES/COUPLES  Above 300                1848
                        Below 150                9476
                        Between 150 and 300     13786
NEW FAMILIES            Above 300                 504
                        Below 150                2696
                        Between 150 and 300      3719
OLDER FAMILIES          Above 300                3277
                        Below 150               17874
                        Between 150 and 300     27443
OLDER SINGLES/COUPLES   Above 300                3955
                        Below 150               20857
                        Between 150 and 300     29667
RETIREES                Above 300                3680
                        Below 150               19030
                        Between 150 and 300     27053
YOUNG FAMILIES          Above 300                2934
                        Below 150               16087
                        Between 150 and 300     24571
YOUNG SINGLES/COUPLES   Above 300                2758
                        Below 150               13826
                        Between 150 and 300     19793
Name: LIFESTAGE, dtype: int64
In [61]:
size1_grouped = df.groupby(['PREMIUM_CUSTOMER','PACKAGE_SIZECATEGORY'])['PREMIUM_CUSTOMER'].count()
size1_grouped
Out[61]:
PREMIUM_CUSTOMER  PACKAGE_SIZECATEGORY
Budget            Above 300                6527
                  Below 150               34830
                  Between 150 and 300     51800
Mainstream        Above 300                7522
                  Below 150               38771
                  Between 150 and 300     55695
Premium           Above 300                4907
                  Below 150               26245
                  Between 150 and 300     38537
Name: PREMIUM_CUSTOMER, dtype: int64
In [62]:
size2_grouped = df.groupby('PACKAGE_SIZECATEGORY')['PACKAGE_SIZECATEGORY'].count().sort_values(ascending=False)
size2_grouped
Out[62]:
PACKAGE_SIZECATEGORY
Between 150 and 300    146032
Below 150               99846
Above 300               18956
Name: PACKAGE_SIZECATEGORY, dtype: int64
In [63]:
# Treemap of purchases by pack-size band. Reuse size2_grouped from the
# previous cell instead of repeating the whole groupby/count/sort pipeline
# just to obtain the labels — its index is already in descending order.
labels = size2_grouped.index.tolist()
color = ['lightpink', 'aliceblue', 'thistle']
squarify.plot(size2_grouped, color=color, label=labels)
plt.show()

The 150–300g package-size band accounts for the largest number of packets purchased.

In [64]:
# Average per-packet price by (lifestage, premium tier), highest first.
# Column selected before .mean() — same result, less work, and no reliance
# on the deprecated implicit dropping of non-numeric columns.
price_grouped = df.groupby(['LIFESTAGE','PREMIUM_CUSTOMER'])[['PRICE']].mean()
price_grouped.sort_values("PRICE",ascending=False)
Out[64]:
PRICE
LIFESTAGE PREMIUM_CUSTOMER
YOUNG SINGLES/COUPLES Mainstream 4.071485
MIDAGE SINGLES/COUPLES Mainstream 4.000101
RETIREES Budget 3.924883
Premium 3.921323
NEW FAMILIES Budget 3.919251
Mainstream 3.916581
OLDER SINGLES/COUPLES Premium 3.887220
Budget 3.877022
NEW FAMILIES Premium 3.871743
RETIREES Mainstream 3.833343
OLDER SINGLES/COUPLES Mainstream 3.803800
YOUNG FAMILIES Budget 3.753659
MIDAGE SINGLES/COUPLES Premium 3.752915
YOUNG FAMILIES Premium 3.752402
OLDER FAMILIES Budget 3.733344
MIDAGE SINGLES/COUPLES Budget 3.728496
OLDER FAMILIES Mainstream 3.727383
YOUNG FAMILIES Mainstream 3.707097
OLDER FAMILIES Premium 3.704625
YOUNG SINGLES/COUPLES Premium 3.645518
Budget 3.637681
In [65]:
plt.figure(figsize=(16,8))
sns.barplot(y=price_grouped.reset_index()['PRICE'],x=price_grouped.reset_index()['LIFESTAGE'],
            hue=price_grouped.reset_index()['PREMIUM_CUSTOMER'], palette='mako')
plt.ylim(3,4.2)
Out[65]:
(3, 4.2)

Mainstream Young Singles/ Couples and Midage Singles/ Couples are willing to spend more per packet of chips.

T-test (Young & Midage Singles/ Couples) - To check whether the segments are statistically different (Price)

In [66]:
# Mainstream vs Premium (young & midage singles/couples).
# Statistical fix: run the t-test on the underlying per-packet prices, not on
# the two group means — testing two summary numbers per side (n=2) has almost
# no power and does not test the stated hypothesis. Welch's test
# (equal_var=False) is used since the segment variances need not be equal.
target_stages = ['YOUNG SINGLES/COUPLES', 'MIDAGE SINGLES/COUPLES']
in_scope = df['LIFESTAGE'].isin(target_stages)
mainstream_prices = df.loc[in_scope & (df['PREMIUM_CUSTOMER'] == 'Mainstream'), 'PRICE']
premium_prices = df.loc[in_scope & (df['PREMIUM_CUSTOMER'] == 'Premium'), 'PRICE']
twosample_results = scipy.stats.ttest_ind(mainstream_prices, premium_prices, equal_var=False)

matrix_twosample = [
    ['Young & Midage Singles/ Couples', 'Test Statistic', 'p-value'],
    ['Mainstream vs Premium', twosample_results[0], twosample_results[1]]]

twosample_table = FF.create_table(matrix_twosample, index=True)
twosample_table.show()
C:\Users\ypleow\anaconda3\lib\site-packages\plotly\tools.py:592: UserWarning:

plotly.tools.FigureFactory.create_table is deprecated. Use plotly.figure_factory.create_table

In [67]:
# Mainstream vs Budget (young & midage singles/couples).
# Same statistical fix as above: compare the raw per-packet prices of the two
# customer groups with Welch's t-test rather than testing two group means.
target_stages = ['YOUNG SINGLES/COUPLES', 'MIDAGE SINGLES/COUPLES']
in_scope = df['LIFESTAGE'].isin(target_stages)
mainstream_prices = df.loc[in_scope & (df['PREMIUM_CUSTOMER'] == 'Mainstream'), 'PRICE']
budget_prices = df.loc[in_scope & (df['PREMIUM_CUSTOMER'] == 'Budget'), 'PRICE']
twosample_results = scipy.stats.ttest_ind(mainstream_prices, budget_prices, equal_var=False)

matrix_twosample = [
    ['Young & Midage Singles/ Couples', 'Test Statistic', 'p-value'],
    ['Mainstream vs Budget', twosample_results[0], twosample_results[1]]]

twosample_table = FF.create_table(matrix_twosample, index=True)
twosample_table.show()
C:\Users\ypleow\anaconda3\lib\site-packages\plotly\tools.py:592: UserWarning:

plotly.tools.FigureFactory.create_table is deprecated. Use plotly.figure_factory.create_table

Both p-values are below 0.05, so the differences in average per-packet price are statistically significant: Mainstream young and midage singles/couples pay more per packet than their Premium and Budget counterparts.

In [68]:
# Top 5 brand / pack-size combinations bought by Mainstream young
# singles/couples. Filter and select the two columns in one .loc call.
young = df.loc[
    (df['PREMIUM_CUSTOMER'] == 'Mainstream')
    & (df['LIFESTAGE'] == 'YOUNG SINGLES/COUPLES'),
    ['BRAND', 'PACKAGE_SIZE'],
]
young.groupby(['BRAND', 'PACKAGE_SIZE'])['BRAND'].count().sort_values(ascending=False).head(5)
Out[68]:
BRAND      PACKAGE_SIZE
Pringles   134             2315
Kettle     150             1778
           175             1776
Doritos    170             1211
Infuzions  110             1187
Name: BRAND, dtype: int64
In [69]:
# Top 5 brand / pack-size combinations bought by Mainstream midage
# singles/couples — same query as the young-segment cell above.
midage = df.loc[
    (df['PREMIUM_CUSTOMER'] == 'Mainstream')
    & (df['LIFESTAGE'] == 'MIDAGE SINGLES/COUPLES'),
    ['BRAND', 'PACKAGE_SIZE'],
]
midage.groupby(['BRAND', 'PACKAGE_SIZE'])['BRAND'].count().sort_values(ascending=False).head(5)
Out[69]:
BRAND      PACKAGE_SIZE
Pringles   134             1159
Kettle     175             1022
           150              951
Thins      175              635
Infuzions  110              629
Name: BRAND, dtype: int64

Both young and midage singles/couples in the Mainstream segment preferred Pringles in the 134g package size, followed by Kettle in the 150g and 175g package sizes.